Data Information

This data was sourced from a study conducted by Boudreault et al, cited below. Alternative splicing (AS) is a method of gene regulation in some Eukaryotes which modifies the sequence of RNA transcripts. AS changes the composition of the proteins resulting from the RNA transcripts through differential choice of exons included in mature mRNAs. This results in higher variability and diversity in the cellular proteome.The study from which this dataset is lifted looks into the global changes in RNA splicing that occurs after infection with a human virus.

Boudreault S, Martenon-Brodeur C, Caron M, Garant JM, Tremblay MP, Armero VE, Durand M, Lapointe E, Thibault P, Tremblay-Létourneau M, Perreault JP, Scott MS, Lemay G, Bisaillon M. Global Profiling of the Cellular Alternative RNA Splicing Landscape during Virus-Host Interactions. PLoS One. 2016 Sep 6;11(9):e0161914.doi: 10.1371/journal.pone.0161914. PMID: 27598998; PMCID: PMC5012649. ________________________________________________________________________________

Setup

This data analysis uses additional packages beyond base R, which includes the package manager pacman.

knitr::opts_chunk$set(echo = TRUE,
                      # warning = FALSE,
                      # message = FALSE,n
                      fig.align = "center",
                      fig.height = 8)

#install.packages('pacman')
#install.packages(c("BiocManager", "RColorBrewer", "tidyverse", "devtools", "pheatmap", "gprofiler2"))
#install.packages("genekitr")
#install.packages("biomaRt")

#BiocManager::install(c("DESeq2", "org.Hs.eg.db", "EnhancedVolcano", "hypeR"))

pacman::p_load(dplyr, ggplot2, DESeq2, tidyverse, forcats, 
               readr, usedist, gridExtra, DESeq2, RColorBrewer, pheatmap,
               genekitr, clusterProfiler, EnhancedVolcano, ggfortify, biomartr,
               hypeR, gprofiler2, BiocManager)

#BiocManager::install(c("DESeq2", "org.Hs.eg.db", "EnhancedVolcano", "hypeR", "biomaRt"))


theme_set(theme_bw())

Bringing in the metadata_SRP074247.tsv file

This is reading in the metadata into a data frame and doing some preliminary data cleaning, which includes separating values and using the janitor function to clean up the metadata.

#:-)

meta <- read.table(file = './Data/SRP074247/metadata_SRP074247.tsv',
           sep = '\t',
           header = TRUE)

janitor::clean_names(meta)
##   refinebio_accession_code experiment_accession refinebio_age
## 1               SRR3471108            SRP074247            NA
## 2               SRR3471109            SRP074247            NA
## 3               SRR3471110            SRP074247            NA
## 4               SRR3471111            SRP074247            NA
## 5               SRR3471112            SRP074247            NA
## 6               SRR3471113            SRP074247            NA
## 7               SRR3471114            SRP074247            NA
## 8               SRR3471115            SRP074247            NA
## 9               SRR3471116            SRP074247            NA
##   refinebio_cell_line refinebio_compound refinebio_developmental_stage
## 1                l929                 NA                            NA
## 2                l929                 NA                            NA
## 3                l929                 NA                            NA
## 4                l929                 NA                            NA
## 5                l929                 NA                            NA
## 6                l929                 NA                            NA
## 7                l929                 NA                            NA
## 8                l929                 NA                            NA
## 9                l929                 NA                            NA
##   refinebio_disease refinebio_disease_stage refinebio_genetic_information
## 1                NA                      NA                            NA
## 2                NA                      NA                            NA
## 3                NA                      NA                            NA
## 4                NA                      NA                            NA
## 5                NA                      NA                            NA
## 6                NA                      NA                            NA
## 7                NA                      NA                            NA
## 8                NA                      NA                            NA
## 9                NA                      NA                            NA
##   refinebio_organism                      refinebio_platform
## 1       MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
## 2       MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
## 3       MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
## 4       MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
## 5       MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
## 6       MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
## 7       MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
## 8       MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
## 9       MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
##   refinebio_processed refinebio_processor_id refinebio_processor_name
## 1                True                    381                 Tximport
## 2                True                    381                 Tximport
## 3                True                    381                 Tximport
## 4                True                    381                 Tximport
## 5                True                    381                 Tximport
## 6                True                    381                 Tximport
## 7                True                    381                 Tximport
## 8                True                    381                 Tximport
## 9                True                    381                 Tximport
##   refinebio_processor_version refinebio_race refinebio_sex
## 1              v1.25.8-hotfix             NA            NA
## 2              v1.25.8-hotfix             NA            NA
## 3              v1.25.8-hotfix             NA            NA
## 4              v1.25.8-hotfix             NA            NA
## 5              v1.25.8-hotfix             NA            NA
## 6              v1.25.8-hotfix             NA            NA
## 7              v1.25.8-hotfix             NA            NA
## 8              v1.25.8-hotfix             NA            NA
## 9              v1.25.8-hotfix             NA            NA
##   refinebio_source_archive_url refinebio_source_database
## 1                           NA                       SRA
## 2                           NA                       SRA
## 3                           NA                       SRA
## 4                           NA                       SRA
## 5                           NA                       SRA
## 6                           NA                       SRA
## 7                           NA                       SRA
## 8                           NA                       SRA
## 9                           NA                       SRA
##                         refinebio_specimen_part refinebio_subject
## 1 c3h/an mouse conjunctive tissue (fibroblasts)    l929 cell line
## 2 c3h/an mouse conjunctive tissue (fibroblasts)    l929 cell line
## 3 c3h/an mouse conjunctive tissue (fibroblasts)    l929 cell line
## 4 c3h/an mouse conjunctive tissue (fibroblasts)    l929 cell line
## 5 c3h/an mouse conjunctive tissue (fibroblasts)    l929 cell line
## 6 c3h/an mouse conjunctive tissue (fibroblasts)    l929 cell line
## 7 c3h/an mouse conjunctive tissue (fibroblasts)    l929 cell line
## 8 c3h/an mouse conjunctive tissue (fibroblasts)    l929 cell line
## 9 c3h/an mouse conjunctive tissue (fibroblasts)    l929 cell line
##   refinebio_time                                  refinebio_title
## 1             NA                          Mock Cells, Replicate 1
## 2             NA                          Mock Cells, Replicate 2
## 3             NA                          Mock Cells, Replicate 3
## 4             NA        Cells infected with Reovirus, replicate 1
## 5             NA        Cells infected with Reovirus, replicate 2
## 6             NA        Cells infected with Reovirus, replicate 3
## 7             NA Cells infected with Mutant Reovirus, replicate 1
## 8             NA Cells infected with Mutant Reovirus, replicate 2
## 9             NA Cells infected with Mutant Reovirus, replicate 3
##   refinebio_treatment meta_sra_age
## 1                  NA          100
## 2                  NA          100
## 3                  NA          100
## 4                  NA          100
## 5                  NA          100
## 6                  NA          100
## 7                  NA          100
## 8                  NA          100
## 9                  NA          100

Clean Meta file

It is important to clean the metadata so the data analysis is accurate and streamlined. meta_clean is a cleaned version of the meta data frame. The treatment variables we are interested in are Mock, Reovirus, and Mutant. The names in meta were cleaned to only include the treatment type and the replicate number. First the treatment type and replicate numbers are separated out into their own columns, and then recombined for a cleaner view of what each column is telling us.

meta_clean <-
  
meta |>
  # Changing columns to choose based on how our dataset is stored
  mutate('sample_ID' = refinebio_accession_code,
         'title' = refinebio_title,
         'treatment' = refinebio_title,
         'replicate' = refinebio_title,
         )

for(i in 1:nrow(meta_clean)){
  if(grepl('Mock', meta_clean$title[i])){
    meta_clean$treatment[i] <- 'uninfected'}
  if(grepl('with Reovirus', meta_clean$title[i])){
    meta_clean$treatment[i] <- 'infected'}
  if(grepl('Mutant', meta_clean$title[i])){
    meta_clean$treatment[i] <- 'mutant'}
}

for(i in 1:nrow(meta_clean)){
  if(grepl('1', meta_clean$title[i])){
    meta_clean$replicate[i] <- '1'}
  if(grepl('2', meta_clean$title[i])){
    meta_clean$replicate[i] <- '2'}
  if(grepl('3', meta_clean$title[i])){
    meta_clean$replicate[i] <- '3'}
}

meta_clean <-
  meta_clean |>
  filter(treatment %in% c('uninfected', 'infected')) |>
  mutate('sample' = paste(treatment, replicate, sep = "_")) |>
  select(sample_ID, sample, title, treatment, replicate) |>
  janitor::clean_names()

meta_clean
##    sample_id       sample                                     title  treatment
## 1 SRR3471108 uninfected_1                   Mock Cells, Replicate 1 uninfected
## 2 SRR3471109 uninfected_2                   Mock Cells, Replicate 2 uninfected
## 3 SRR3471110 uninfected_3                   Mock Cells, Replicate 3 uninfected
## 4 SRR3471111   infected_1 Cells infected with Reovirus, replicate 1   infected
## 5 SRR3471112   infected_2 Cells infected with Reovirus, replicate 2   infected
## 6 SRR3471113   infected_3 Cells infected with Reovirus, replicate 3   infected
##   replicate
## 1         1
## 2         2
## 3         3
## 4         1
## 5         2
## 6         3

Counts Matrix

The counts matrix is a data frame which contains the actual data output from the RNA-seq experiment, and will be used for the differential gene expression analysis. _________________________________________________________________________________

Reading in the counts matrix

# Read in counts matrix
data <- read.table(file = './Data/SRP074247/SRP074247.tsv',
           sep = '\t',
           header = TRUE,
           row.names = 1) |>
  select(-c(SRR3471114, SRR3471115, SRR3471116))

head(data)
##                    SRR3471108   SRR3471109   SRR3471110  SRR3471111
## ENSMUSG00000000001 19889.8320 19485.824000 15567.427000 16893.19700
## ENSMUSG00000000003     0.0000     0.000000     0.000000     0.00000
## ENSMUSG00000000028   966.2573  1009.518100   790.992900   544.20966
## ENSMUSG00000000031     0.0000     0.000000     0.000000     0.00000
## ENSMUSG00000000037   186.0445   196.689480   166.312350    73.05573
## ENSMUSG00000000049     0.0000     1.311085     0.703601     4.89068
##                      SRR3471112  SRR3471113
## ENSMUSG00000000001 16797.210000 13591.75300
## ENSMUSG00000000003     0.000000     0.00000
## ENSMUSG00000000028   695.027340   492.18677
## ENSMUSG00000000031     0.000000     0.00000
## ENSMUSG00000000037    78.764824    76.60153
## ENSMUSG00000000049     1.354603     0.00000
identical(colnames(data), meta_clean$sample_id)
## [1] TRUE

Changing the Column Names

This is renaming the columns created above for treatment type (infected or uninfected) and replicate number on the counts matrix

colnames(data) <- paste(meta_clean$treatment, meta_clean$replicate, sep = "_")

tibble(data)
## # A tibble: 41,249 × 6
##    uninfected_1 uninfected_2 uninfected_3 infected_1 infected_2 infected_3
##           <dbl>        <dbl>        <dbl>      <dbl>      <dbl>      <dbl>
##  1       19890.     19486.      15567.      16893.     16797.      13592. 
##  2           0          0           0           0          0           0  
##  3         966.      1010.        791.        544.       695.        492. 
##  4           0          0           0           0          0           0  
##  5         186.       197.        166.         73.1       78.8        76.6
##  6           0          1.31        0.704       4.89       1.35        0  
##  7         831.       830.        617.        254.       213.        181. 
##  8        3131.      3077.       2470.       2406.      2472.       2053. 
##  9        5645.      5921.       4623.      18342.     19434.      15835. 
## 10         714.       843.        781.        675.       706.        594. 
## # ℹ 41,239 more rows

This is adding the gene name to each row

data_clean <- rownames_to_column(data, 'gene')

tibble(data_clean)
## # A tibble: 41,249 × 7
##    gene  uninfected_1 uninfected_2 uninfected_3 infected_1 infected_2 infected_3
##    <chr>        <dbl>        <dbl>        <dbl>      <dbl>      <dbl>      <dbl>
##  1 ENSM…       19890.     19486.      15567.      16893.     16797.      13592. 
##  2 ENSM…           0          0           0           0          0           0  
##  3 ENSM…         966.      1010.        791.        544.       695.        492. 
##  4 ENSM…           0          0           0           0          0           0  
##  5 ENSM…         186.       197.        166.         73.1       78.8        76.6
##  6 ENSM…           0          1.31        0.704       4.89       1.35        0  
##  7 ENSM…         831.       830.        617.        254.       213.        181. 
##  8 ENSM…        3131.      3077.       2470.       2406.      2472.       2053. 
##  9 ENSM…        5645.      5921.       4623.      18342.     19434.      15835. 
## 10 ENSM…         714.       843.        781.        675.       706.        594. 
## # ℹ 41,239 more rows

Pivoting the data frame

Initially, the read matrix had one row for every gene, and each column was the read count for each treatment and replicate. By pivoting the data, we now are able to have more than one row for each gene (there are now nine rows for each gene, as there are nine sample treatment types), and the columns are divided by sample, treatment, replicate, gene, and reads. This way, the data is easier to assess and the data frame contains more information than just the read counts.

data_long <-
  pivot_longer(data = data_clean,
               cols = -gene,
               names_to = 'ID',
               values_to = 'reads') |>
  
  mutate('sample' = ID) |>

  separate(col = 'ID',
           into = c('treatment', 'replicate'),
           sep = '_') |>
  
  mutate(treatment = as.factor(treatment),
         replicate = as.factor(replicate)) |>
  
  select(sample, treatment, replicate, gene, reads)

tibble(data_long)
## # A tibble: 247,494 × 5
##    sample       treatment  replicate gene                reads
##    <chr>        <fct>      <fct>     <chr>               <dbl>
##  1 uninfected_1 uninfected 1         ENSMUSG00000000001 19890.
##  2 uninfected_2 uninfected 2         ENSMUSG00000000001 19486.
##  3 uninfected_3 uninfected 3         ENSMUSG00000000001 15567.
##  4 infected_1   infected   1         ENSMUSG00000000001 16893.
##  5 infected_2   infected   2         ENSMUSG00000000001 16797.
##  6 infected_3   infected   3         ENSMUSG00000000001 13592.
##  7 uninfected_1 uninfected 1         ENSMUSG00000000003     0 
##  8 uninfected_2 uninfected 2         ENSMUSG00000000003     0 
##  9 uninfected_3 uninfected 3         ENSMUSG00000000003     0 
## 10 infected_1   infected   1         ENSMUSG00000000003     0 
## # ℹ 247,484 more rows
colors <- c('infected' = '#FA8633',
            'uninfected' = 'lightseagreen')

Checking the quality of the dataset

For gene expression profiling experiments, we hope for between 10-25 million reads per sample. Below is a bar plot showing the sum of the total reads for each sample

We must find if there is a significant different in the total counts by treatment

sum_by_sample <-
  data_long |> 
  group_by(treatment, sample) |> 
  summarize('sum' = sum(reads)) |>
  ungroup()
## `summarise()` has grouped output by 'treatment'. You can override using the
## `.groups` argument.
ggplot(data = data_long,
       mapping = aes(x = sample)) +
          
  geom_col(data = sum_by_sample,
           mapping = aes(y = sum,
                         fill = treatment),
           color = 'black') +
  
  labs(x = 'Sample',
       y = 'Total Reads',
       title = 'Assessing the quality of the dataset using number of reads',
       caption = 'Data from refine.bio\nAccession ID: SRP074247',
       fill = 'Sample type') +
  
  theme(plot.title = element_text(hjust = 0.5,
                                  size = 16),
        axis.text.x = element_text(angle = 90, 
                                   vjust = 0.5, 
                                   hjust = 1),
        axis.title.y = element_text(size = 14),
        axis.title.x = element_text(size = 14),
        legend.position = 'right',
        plot.caption = element_text(face = 'italic')) +
  
  scale_fill_manual(values = colors) +
  
  scale_y_continuous(expand = c(0.01, 0))

Boxplot of the number of reads by sample type

ggplot(data = sum_by_sample |> group_by(treatment) |> summarize('sum' = sum(sum)),
       mapping = aes(x = sum, 
                     y = treatment,
                     fill = treatment)) +
  
  geom_boxplot(data = data_long |> group_by(reads),
               mapping = aes(x = log2(reads))) +
  
  
  scale_fill_manual(values = colors) +
  
  labs(title = 'Number of reads by sample type',
       x = 'log2(reads)',
       y = NULL,
       fill = 'Sample type') +
  
  theme(plot.title = element_text(hjust = 0.5,
                                  size = 16))
## Warning: Removed 91153 rows containing non-finite values (`stat_boxplot()`).

Evalutating statistical significance

We are conducting a T-Test to evaluate whether or not there is a statistical difference in counts by treatment. We are hoping for a high p-value which will indicate no statistical significance in the difference in read counts by treatment.

Rather than conducting a T-test to establish statistical significance, we will use Analysis of variance (ANOVA). This is because we have 3 treatment variables, and a T test is only able to analyze significance between 2 variables.

t.test(colSums(data) ~ meta_clean$treatment)
## 
##  Welch Two Sample t-test
## 
## data:  colSums(data) by meta_clean$treatment
## t = 1.2091, df = 3.9817, p-value = 0.2935
## alternative hypothesis: true difference in means between group infected and group uninfected is not equal to 0
## 95 percent confidence interval:
##  -3614589  9173758
## sample estimates:
##   mean in group infected mean in group uninfected 
##                 31201792                 28422207

Histogram of reads by gene

sum_by_gene <-
  data_long |> 
  group_by(gene) |> 
  summarize('sum' = sum(reads)) |>
  ungroup()

ggplot(data = sum_by_gene,
       mapping = aes(x = log2(sum))) +
  
  geom_histogram(bins = 15,
                 color = 'black',
                 fill = '#9A6DA3') +
  
  labs(x = 'Log-2 transformed counts per gene',
       y = 'Number of genes',
       title = 'Number of reads by gene') +
  
   theme(plot.title = element_text(hjust = 0.5,
                                  size = 16))
## Warning: Removed 9696 rows containing non-finite values (`stat_bin()`).

Exclusing low abundance genes

high_abun_genes <-
  data_long |>
  pivot_wider(names_from = gene,
              values_from = reads) |>
  select(where(~ all(. != 0)))

high_abun_long <- 
  high_abun_genes |>
  pivot_longer(cols = -c(sample, treatment, replicate),
               names_to = 'gene',
               values_to = 'reads')

high_abun_sum <-
  high_abun_long |>
  group_by(gene) |> 
  summarize('sum' = sum(reads)) |>
  ungroup()

ggplot(data = high_abun_sum,
       mapping = aes(x = log2(sum))) +
  
  geom_histogram(bins = 15,
                 color = 'black',
                 fill = '#9A6DA3') +
  
  labs(x = 'Log-2 transformed counts per gene',
       y = 'Number of genes',
       title = 'Number of reads by gene (excluding low abundance genes)') +
  
   theme(plot.title = element_text(hjust = 0.5,
                                  size = 16))

non_normal_graph <-
  
ggplot(data = high_abun_long,
       mapping = aes(x = sample)) +
          
  geom_boxplot(mapping = aes(y = log2(reads),
                             fill = treatment),
              color = 'black') +
  
  labs(x = 'Sample ID',
       y = 'Log 2 transformed counts per gene',
       title = 'Non-normalized counts',
       caption = 'Data from refine.bio\nAccession ID: SRP074247',
       fill = 'Sample type') +
  
  theme(plot.title = element_text(hjust = 0.5,
                                  size = 16),
        axis.text.x = element_text(angle = 90, 
                                   vjust = 0.5, 
                                   hjust = 1),
        axis.title.y = element_text(size = 14),
        axis.title.x = element_text(size = 14),
        legend.position = 'right',
        plot.caption = element_text(face = 'italic')) +
  
  scale_fill_manual(values = colors) +
  
  scale_y_continuous(expand = c(0.01, 0))

non_normal_graph

sample_names <- c('uninfected_1', 'uninfected_2', 'uninfected_3', 'infected_1', 'infected_2', 'infected_3')
high_abun_log2 <-
  high_abun_long |> 
  mutate(reads = log2(reads)) |>
  pivot_wider(names_from = gene,
              values_from = reads)

high_abun_log2_flipped <-
  data.frame(t(high_abun_log2 |> select(-c(treatment, replicate)))) |> janitor::row_to_names(row_number = 1) |>
  mutate_all(function(x) as.numeric(as.character(x)))


data_dist <- dist(high_abun_log2, method = "euclidean")
## Warning in dist(high_abun_log2, method = "euclidean"): NAs introduced by
## coercion
data_dist <- dist_setNames(data_dist, sample_names)


plot(hclust(data_dist, method = "complete"), xlab = "Sample", main = NULL)

sample_medians <- (apply(high_abun_log2_flipped, 2, median))

grand_median <- mean(sample_medians)

correction_factors <- grand_median - sample_medians

corrections <- data.frame(correction_factors)

corrections
##              correction_factors
## uninfected_1         -0.1308265
## uninfected_2         -0.1311785
## uninfected_3          0.1408655
## infected_1           -0.0419730
## infected_2           -0.0345170
## infected_3            0.1976295
norm_data <- high_abun_log2_flipped

for(i in 1:ncol(norm_data)){
  norm_data[,i] <- high_abun_log2_flipped[,i] + corrections$correction_factors[i]
}
norm_data_long <-
  norm_data |> rownames_to_column('gene') |>
  pivot_longer(cols = -gene,
               names_to = 'ID',
               values_to = 'normalized_reads') |>
  
  mutate('sample' = ID) |>

  separate(col = 'ID',
           into = c('treatment', 'replicate'),
           sep = '_') |>
  
  mutate(treatment = as.factor(treatment),
         replicate = as.factor(replicate)) |>
  
  select(sample, treatment, replicate, gene, normalized_reads)
normal_graph <-
  
ggplot(data = norm_data_long,
       mapping = aes(x = sample)) +
          
  geom_boxplot(mapping = aes(y = normalized_reads,
                             fill = treatment),
              color = 'black') +
  
  labs(x = 'Sample ID',
       y = 'Log 2 transformed counts per gene',
       title = 'Normalized counts',
       caption = 'Data from refine.bio\nAccession ID: SRP074247',
       fill = 'Sample type') +
  
  theme(plot.title = element_text(hjust = 0.5,
                                  size = 16),
        axis.text.x = element_text(angle = 90, 
                                   vjust = 0.5, 
                                   hjust = 1),
        axis.title.y = element_text(size = 14),
        axis.title.x = element_text(size = 14),
        legend.position = 'right',
        plot.caption = element_text(face = 'italic')) +
  
  scale_fill_manual(values = colors) +
  
  scale_y_continuous(expand = c(0.01, 0))

normal_graph

grid.arrange(non_normal_graph, normal_graph, ncol=2)

norm_log2 <-
  norm_data_long |>
  mutate(normalized_reads = log2(normalized_reads)) |>
  pivot_wider(names_from = gene,
              values_from = normalized_reads)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `normalized_reads = log2(normalized_reads)`.
## Caused by warning:
## ! NaNs produced
norm_dist <- dist(norm_log2, method = "euclidean")
## Warning in dist(norm_log2, method = "euclidean"): NAs introduced by coercion
norm_dist <- dist_setNames(norm_dist, sample_names)

plot(hclust(norm_dist, method = "complete"), xlab = "Sample", main = NULL)

PCA <- prcomp(t(norm_dist))

autoplot(PCA, data = data_long, color = 'treatment', main = "Colored by Treatment", size = 3)
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded

## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded

data_round <- 
  round(data.matrix(data))

#Make sure order is correct
head(data_round,2)
##                    uninfected_1 uninfected_2 uninfected_3 infected_1 infected_2
## ENSMUSG00000000001        19890        19486        15567      16893      16797
## ENSMUSG00000000003            0            0            0          0          0
##                    infected_3
## ENSMUSG00000000001      13592
## ENSMUSG00000000003          0
meta_clean[, c('sample')]
## [1] "uninfected_1" "uninfected_2" "uninfected_3" "infected_1"   "infected_2"  
## [6] "infected_3"
dds <- DESeqDataSetFromMatrix(countData = data_round,           
                                       colData = meta_clean,
                                       design = ~ treatment)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
# pre-filter to remove low-read rows
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
dds$treatment <- relevel(dds$treatment, ref = "uninfected")
# run Differential Expression Analysis
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds)

head(res)
## log2 fold change (MLE): treatment infected vs uninfected 
## Wald test p-value: treatment infected vs uninfected 
## DataFrame with 6 rows and 6 columns
##                     baseMean log2FoldChange     lfcSE      stat      pvalue
##                    <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSMUSG00000000001 16796.973      -0.185684 0.0467589  -3.97109 7.15435e-05
## ENSMUSG00000000028   737.681      -0.647278 0.1004754  -6.44215 1.17792e-10
## ENSMUSG00000000037   127.985      -1.227260 0.1929378  -6.36091 2.00559e-10
## ENSMUSG00000000056   477.308      -1.780200 0.1182042 -15.06039 2.95050e-51
## ENSMUSG00000000058  2565.718      -0.291981 0.0559741  -5.21637 1.82468e-07
## ENSMUSG00000000078 11555.826       1.759546 0.0458610  38.36696 0.00000e+00
##                           padj
##                      <numeric>
## ENSMUSG00000000001 2.38709e-04
## ENSMUSG00000000028 6.52838e-10
## ENSMUSG00000000037 1.09498e-09
## ENSMUSG00000000056 7.59753e-50
## ENSMUSG00000000058 7.89931e-07
## ENSMUSG00000000078 0.00000e+00
print('Dimensions:')
## [1] "Dimensions:"
dim(res)
## [1] 23799     6
vsd <- vst(dds, blind=FALSE)

plotPCA(vsd, intgroup=c("treatment")) + theme_classic() + scale_color_manual(values = colors)
## using ntop=500 top features by variance

rld <- rlog(dds, blind=FALSE)
rld_mat <- assay(rld)  
rld_cor <- cor(rld_mat) 

heat.colors <- brewer.pal(6, "Blues")
pheatmap(rld_cor, color = heat.colors,
         breaks = c(0.97, 0.99, 0.995, 0.999, 0.9995, 0.99995))

infected_n_res <- results(dds, contrast = c("treatment", "infected", "uninfected"))
infected_n_sigs <- na.omit(infected_n_res)
infected_n_sigs <- subset(infected_n_sigs, padj < 0.05 & abs(log2FoldChange) > 1)
infected_n_sig_data <- merge(data.frame(infected_n_sigs),
                        data.frame(counts(dds, normalized = TRUE)), 
                        by = "row.names", sort=FALSE)

names(infected_n_sig_data)[1] <- "Ensembl_ID"

infected_n_res_data <- merge(data.frame(infected_n_res),
                        data.frame(counts(dds, normalized = TRUE)),
                        by="row.names", sort=FALSE)

names(infected_n_res_data)[1] <- "Ensembl_ID"  

head(res)
## log2 fold change (MLE): treatment infected vs uninfected 
## Wald test p-value: treatment infected vs uninfected 
## DataFrame with 6 rows and 6 columns
##                     baseMean log2FoldChange     lfcSE      stat      pvalue
##                    <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSMUSG00000000001 16796.973      -0.185684 0.0467589  -3.97109 7.15435e-05
## ENSMUSG00000000028   737.681      -0.647278 0.1004754  -6.44215 1.17792e-10
## ENSMUSG00000000037   127.985      -1.227260 0.1929378  -6.36091 2.00559e-10
## ENSMUSG00000000056   477.308      -1.780200 0.1182042 -15.06039 2.95050e-51
## ENSMUSG00000000058  2565.718      -0.291981 0.0559741  -5.21637 1.82468e-07
## ENSMUSG00000000078 11555.826       1.759546 0.0458610  38.36696 0.00000e+00
##                           padj
##                      <numeric>
## ENSMUSG00000000001 2.38709e-04
## ENSMUSG00000000028 6.52838e-10
## ENSMUSG00000000037 1.09498e-09
## ENSMUSG00000000056 7.59753e-50
## ENSMUSG00000000058 7.89931e-07
## ENSMUSG00000000078 0.00000e+00
ids <- as.character(infected_n_res_data$Ensembl_ID)

gene_list <- biomaRt::getBM(filter = 'ensembl_gene_id',
                  attributes = c("ensembl_gene_id","uniprot_gn_symbol"), 
                  values = ids,
                  mart = biomaRt::useMart(biomart="ensembl", dataset="mmusculus_gene_ensembl"))
infected_n_res_data <- merge(infected_n_res_data, gene_list, by.x="Ensembl_ID", by.y="ensembl_gene_id")

infected_n_sig_data <- merge(infected_n_sig_data, gene_list, by.x="Ensembl_ID", by.y="ensembl_gene_id")
#output tabular files
write.csv(infected_n_res_data, 
          quote = FALSE,
          file = "infected_vs_uninfected_normalized_matrix.csv")

#infected_n_sig_data <- subset(infected_n_res_data_gene, 
                                 # padj < 0.05 & abs(log2FoldChange) > 1)

write.csv(infected_n_sig_data, 
          quote = FALSE,
          file = "infected_vs_uninfected_deg_padj0.05_log2fc1.csv")
EnhancedVolcano(infected_n_res_data,
                lab = as.character(infected_n_res_data$uniprot_gn_symbol),
                x = 'log2FoldChange',
                y = 'padj', 
                title = "SLE versus healthy controls",
                subtitle = "Differential expression",
                caption = paste0("Upregulated = 330, Downregulated = 222"),
                xlim = c(-10,10),
                ylim = c(0,20),
                FCcutoff = 1,
                pCutoff = 0.05,
                labSize = 4, 
                axisLabSize = 12, 
                col=c('grey40', 'grey40', 'grey40', '#FA8633'),
                legendLabels=c('Not sig.','Log2FC','padj', 'padj & Log2FC'),
                legendPosition = 'right',
                legendLabSize = 10,
                legendIconSize = 5.0,
                gridlines.major = FALSE,
                gridlines.minor = FALSE)
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest
## non-zero p-value...

# plotting upregulated gene

d <- plotCounts(dds, gene="ENSMUSG00000024678", intgroup="treatment", returnData=TRUE)

# Plotting the MOV10 normalized counts, using the samplenames (rownames of d as labels)
ggplot(d, aes(x = treatment, y = count, color = treatment)) + 
  geom_point(position=position_jitter(w = 0.1,h = 0),
             size = 2.5) +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_manual(values = colors) +
  labs(title = "Ms4a4d",
       x = 'Treatment',
       y = 'Count')

# plotting downregulated gene
d <- plotCounts(dds, gene="ENSMUSG00000056328", intgroup="treatment", returnData=TRUE)

# Plotting the MOV10 normalized counts, using the samplenames (rownames of d as labels)
ggplot(d, aes(x = treatment, y = count, color = treatment)) + 
  geom_point(position=position_jitter(w = 0.1,h = 0),
             size = 2.5) +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_manual(values = colors) +
  labs(title = "Myh1",
       x = 'Treatment',
       y = 'Count')

## Order results by padj values
top20_sigOE_genes <- infected_n_res_data |>
        arrange(padj) |>    #Arrange rows by padj values
        pull(uniprot_gn_symbol) |>      #Extract character vector of ordered genes
        head(n=20)      #Extract the first 20 genes

## normalized counts for top 20 significant genes
top20_sigOE_norm <- infected_n_sig_data |>
        filter(uniprot_gn_symbol %in% top20_sigOE_genes)

# Gathering the columns to have normalized counts to a single column
gathered_top20_sigOE <- top20_sigOE_norm |>
  gather(colnames(top20_sigOE_norm)[8:13], key = "sample", value = "normalized_counts")

## check the column header in the "gathered" data frame
head(gathered_top20_sigOE)
##           Ensembl_ID  baseMean log2FoldChange      lfcSE     stat pvalue padj
## 1 ENSMUSG00000000078 11555.826       1.759546 0.04586097 38.36696      0    0
## 2 ENSMUSG00000000275 12527.828       3.706646 0.07581458 48.89094      0    0
## 3 ENSMUSG00000000386 15980.147      10.206198 0.16649298 61.30107      0    0
## 4 ENSMUSG00000001123  3666.440       3.923118 0.06682685 58.70572      0    0
## 5 ENSMUSG00000002307  4991.193       3.812616 0.08460128 45.06570      0    0
## 6 ENSMUSG00000002325  1925.976       3.598447 0.07621423 47.21490      0    0
##   uniprot_gn_symbol       sample normalized_counts
## 1              Klf6 uninfected_1        5252.72999
## 2            Trim25 uninfected_1        1903.82384
## 3               Mx1 uninfected_1          37.22041
## 4            Lgals9 uninfected_1         443.85336
## 5              Daxx uninfected_1         551.79254
## 6              Irf9 uninfected_1         303.34632
gathered_top20_sigOE <- inner_join(meta_clean, gathered_top20_sigOE)
## Joining with `by = join_by(sample)`
## plot using ggplot2

ggplot(data = gathered_top20_sigOE,
       mapping = aes(x = as.character(uniprot_gn_symbol), y = normalized_counts, color = treatment)) +
  geom_point() +
  scale_y_log10() +
  xlab("Gene") +
  ylab("log10 Normalized Counts") +
  ggtitle("Top 20 Significant DE Genes") +
  theme_bw() +
  scale_color_manual(values = colors) +
      
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(plot.title = element_text(hjust = 0.5))

KEGG <- msigdb_gsets(species="Mus musculus", 
                     category="C2", 
                     subcategory="CP:KEGG",
                     clean = TRUE)
hyp_kegg <- hypeR(gene_list$uniprot_gn_symbol, 
                  KEGG, 
                  test="hypergeometric", 
                  background=50000, 
                  fdr=0.05, 
                  plotting=TRUE)
hyp_dots(hyp_kegg, 
         title="KEGG", 
         abrv=30, 
         val = "fdr")

HALLMARK <- msigdb_gsets("Mus musculus", 
                         "H", 
                         "",
                         clean = TRUE)

# Hallmark
hyp_Hallmark <- hypeR(gene_list$uniprot_gn_symbol, 
                      HALLMARK, 
                      test="hypergeometric", 
                      background=50000, 
                      fdr=0.05, 
                      plotting=TRUE)

#show top pathway
hyp_dots(hyp_Hallmark, 
         title="HALLMARK", 
         abrv=50, 
         val = "fdr")

GOBP<- msigdb_gsets(species = "Mus musculus", 
                    "C5", 
                    "GO:BP",
                    clean = TRUE)
# GOBP
hyp_GOBP <- hypeR(gene_list$uniprot_gn_symbol, 
                  GOBP, 
                  test="hypergeometric", 
                  background=50000, 
                  fdr=0.05, 
                  plotting=TRUE)

#show top pathway
hyp_dots(hyp_GOBP, title="GO: Biological Pathways", abrv=50, val = "fdr")
## Warning: Transformation introduced infinite values in continuous y-axis

up_degs <- subset(infected_n_sig_data, log2FoldChange > 0)
up_degs <- up_degs$uniprot_gn_symbol
up_degs <- na.omit(up_degs)


down_degs <- subset(infected_n_sig_data, log2FoldChange < 0)
down_degs <- down_degs$uniprot_gn_symbol
down_degs <- na.omit(down_degs)
hyp_KEGG_up <- hypeR(up_degs,
                     KEGG, 
                     test="hypergeometric", 
                     background=50000, 
                     fdr=0.05, 
                     plotting=TRUE)

#show top pathway
#plot_KEGG_up <- 
  hyp_dots(hyp_KEGG_up, title="KEGG Pathways Upregulated in SLE patients", abrv=50, val = "fdr")

#ggsave("Dotplot_KEGG_upregulated.png")
# KEGG_down
hyp_KEGG_down <- hypeR(down_degs, 
                       KEGG, 
                       test="hypergeometric",
                       background=50000, 
                       fdr=0.05, 
                       plotting=TRUE)

#plot_KEGG_down <- 
hyp_dots(hyp_KEGG_down, title="KEGG Pathways Downregulated in SLE patients", abrv=50, val = "fdr")

#ggsave("Dotplot_KEGG_downregulated.png")
# THIS WHERE WE AT BUT IT'S BEDTIME
sle_vs_n_gost <- gost(query = gene_list$uniprot_gn_symbol, 
                      organism = "mmusculus", 
                      sources = c("GO:BP", "KEGG", "REAC", "WP"),
                      significant = TRUE, 
                      correction_method ="fdr",
                      domain_scope="annotated")

gostplot(sle_vs_n_gost, interactive = TRUE)
# IDK WHAT THIS IS FOR
REACTOME <- msigdb_gsets(species="Mus musculus", 
                         category="C2", 
                         subcategory="CP:REACTOME")